First, we need to read in the data and combine the training and test sets by stacking their rows into a single data frame.
# Read the Dreaddit training split (readr echoes the column spec below:
# 2838 posts, 116 columns — 4 character, 112 numeric).
train_data <- read_csv("reddit_stress_data/dreaddit-train.csv")
## Rows: 2838 Columns: 116
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): subreddit, post_id, sentence_range, text
## dbl (112): id, label, confidence, social_timestamp, social_karma, syntax_ari...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the held-out test split (same 116-column schema, 715 posts).
test_data <- read_csv("reddit_stress_data/dreaddit-test.csv")
## Rows: 715 Columns: 116
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): subreddit, post_id, sentence_range, text
## dbl (112): id, label, confidence, social_timestamp, social_karma, syntax_ari...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Stack the train and test rows into one data frame for exploratory work.
# bind_rows() is the correct verb here: add_row() expects name-value pairs
# describing a single new row, not a whole second data frame.
reddit_stress_data <- bind_rows(train_data, test_data)
Now we need to find the word distributions. We’ll start by unnesting the tokens, applying the same cleaning pipeline to the full dataset as well as to the train and test splits separately.
# Tokenize the combined data: one row per (post, word), keeping only the id,
# label, and subreddit metadata alongside each token.
words_tokenized <- reddit_stress_data %>%
select(c("id", "text", "label", "subreddit")) %>%
unnest_tokens(word, text) %>%
# Strip any punctuation characters left inside tokens.
mutate(word = gsub('[[:punct:]]+','', word)) %>%
# Collapse standalone numbers to a single "%d%" placeholder token.
mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
# Drop common stop words (joined on the "word" column, per the message below).
anti_join(stop_words)
## Joining, by = "word"
# Same tokenization pipeline, applied to the test split only: keep the
# metadata columns, unnest to one word per row, clean punctuation, replace
# standalone numbers with "%d%", and remove stop words.
words_tokenized_test <- test_data %>%
  select(id, text, label, subreddit) %>%
  unnest_tokens(word, text) %>%
  mutate(
    word = gsub("[[:punct:]]+", "", word),
    word = gsub("\\<[[:digit:]]+\\>", "%d%", word)
  ) %>%
  anti_join(stop_words)
## Joining, by = "word"
# Tokenize the training split, mirroring the full-data pipeline above.
words_tokenized_train <- train_data %>%
  select(all_of(c("id", "text", "label", "subreddit"))) %>%
  unnest_tokens(word, text) %>%
  mutate(
    word = gsub("[[:punct:]]+", "", word),
    word = gsub("\\<[[:digit:]]+\\>", "%d%", word)
  ) %>%
  anti_join(stop_words)
## Joining, by = "word"
# Count posts per stress label and draw a bar chart of the distribution.
label_counts <- count(group_by(reddit_stress_data, label))
plot_ly(label_counts, type = "bar", x = ~label, y = ~n)
# Count posts per subreddit and plot a bar chart. plotly selects the trace
# with `type =`; the original call used `kind = "bar"`, which plotly warned
# was not a valid 'bar' attribute and silently ignored (the bar type was
# only inferred from the data).
subreddit_counts <- reddit_stress_data %>%
  group_by(subreddit) %>%
  count()
plot_ly(subreddit_counts, x = ~subreddit, y = ~n, type = "bar")
# Stacked bar chart: posts per subreddit, segments colored by stress label.
reddit_stress_data %>%
  ggplot(aes(y = subreddit, fill = as.factor(label))) +
  geom_bar(position = "stack")
Now let’s see the most common words among the data (overall).
# Return the `num` most frequent words in a tokenized data frame (one row
# per token in column `word`), ordered from most to least common.
GetTopNMostCommonWords <- function(df, num) {
  df %>%
    count(word, sort = TRUE) %>%
    head(num)
}
# Number of top words to show in the frequency plots below.
num <- 15
# NOTE(review): the variable name says "10" but `num` is 15, so the plots
# below actually show 15 words — consider renaming for consistency.
top_10_full_data <- GetTopNMostCommonWords(words_tokenized, num)
Now I will plot the top 15 most common words in the dataset.
# Bar chart of the most common words in the full dataset. The title is
# built from `num` so it stays in sync with the number of words plotted
# (the hard-coded "Top 10" disagreed with num = 15).
ggplot(top_10_full_data, aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = paste0("Top ", num, " Words from the Full Dataset"),
       x = "Word", y = "Frequency")
Now let’s see how this varies among label: stressed or non-stressed.
# In the Dreaddit dataset, label == 1 marks a stressed post and label == 0
# a non-stressed one; the original filters had these two reversed, so every
# downstream "stressed" plot was actually showing non-stressed words.
stressed_data <- filter(words_tokenized, label == 1)
non_stressed_data <- filter(words_tokenized, label == 0)
Now let’s plot them
# Most common words among non-stressed posts; title derived from `num`
# (the hard-coded "Top 10" disagreed with num = 15).
ggplot(GetTopNMostCommonWords(non_stressed_data, num),
       aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = paste0("Top ", num, " Words from the Non-Stressed Dataset"),
       x = "Word", y = "Frequency")
Now let’s see the difference among stressed data.
# Most common words among stressed posts; title derived from `num`
# (the hard-coded "Top 10" disagreed with num = 15).
ggplot(GetTopNMostCommonWords(stressed_data, num),
       aes(x = reorder(word, desc(n)), y = n)) +
  geom_col(fill = "steelblue") +
  labs(title = paste0("Top ", num, " Words from the Stressed Dataset"),
       x = "Word", y = "Frequency")
# Part 4: Visualizing the Distribution of Sentiment

## Overall
# Boxplot of sentiment across all posts. `bins` is a histogram parameter
# and is not used by geom_boxplot — ggplot warned "Ignoring unknown
# parameters: bins" — so it is dropped here.
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment")
# Histogram of sentiment across the full dataset (50 bins).
reddit_stress_data %>%
  ggplot(aes(x = sentiment)) +
  geom_histogram(bins = 50, fill = "steelblue") +
  labs(title = "Distribution of Sentiment")
## By Label
# Sentiment boxplots faceted by stress label. The unused `bins` argument
# (a histogram parameter that geom_boxplot warned about and ignored) is
# removed.
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment") +
  facet_wrap(~ label)
# Sentiment histograms, one panel per stress label.
reddit_stress_data %>%
  ggplot(aes(x = sentiment)) +
  geom_histogram(bins = 50, fill = "steelblue") +
  facet_wrap(~ label) +
  labs(title = "Distribution of Sentiment")
## By Subreddit
# Sentiment boxplots faceted by subreddit. The unused `bins` argument
# (a histogram parameter that geom_boxplot warned about and ignored) is
# removed.
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment") +
  facet_wrap(~ subreddit)
# Sentiment boxplots in a subreddit-by-label grid. The unused `bins`
# argument (a histogram parameter that geom_boxplot warned about and
# ignored) is removed.
ggplot(reddit_stress_data, aes(x = sentiment)) +
  geom_boxplot(fill = "steelblue") +
  labs(title = "Distribution of Sentiment") +
  facet_grid(subreddit ~ label)